#1 MVP We’ve looked at a few different ways in which we can build models this week, including how to prepare them properly. This weekend we’ll build a multiple linear regression model on a dataset which will need some preparation. The data has come from Kaggle and can be found in the data folder.
We want to model avocado sales. You’ll need to identify the target variable and use the tools we’ve worked with this week in order to prepare your dataset and find appropriate predictors. Once you’ve built your model use the validation techniques discussed on Wednesday to evaluate it.
#2 Extensions Build a decision tree to model the likelihood of a sale being of an organic avocado. Use k-means clustering to investigate potential relationships between the year and the average avocado price.
library(tidyverse)
#loading in data
avocado <- read.csv("data/avocado.csv")
# investigate structure and summary of data
str(avocado)
'data.frame': 18249 obs. of 14 variables:
$ X : int 0 1 2 3 4 5 6 7 8 9 ...
$ Date : Factor w/ 169 levels "2015-01-04","2015-01-11",..: 52 51 50 49 48 47 46 45 44 43 ...
$ AveragePrice: num 1.33 1.35 0.93 1.08 1.28 1.26 0.99 0.98 1.02 1.07 ...
$ Total.Volume: num 64237 54877 118220 78992 51040 ...
$ X4046 : num 1037 674 795 1132 941 ...
$ X4225 : num 54455 44639 109150 71976 43838 ...
$ X4770 : num 48.2 58.3 130.5 72.6 75.8 ...
$ Total.Bags : num 8697 9506 8145 5811 6184 ...
$ Small.Bags : num 8604 9408 8042 5677 5986 ...
$ Large.Bags : num 93.2 97.5 103.1 133.8 197.7 ...
$ XLarge.Bags : num 0 0 0 0 0 0 0 0 0 0 ...
$ type : Factor w/ 2 levels "conventional",..: 1 1 1 1 1 1 1 1 1 1 ...
$ year : int 2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
$ region : Factor w/ 54 levels "Albany","Atlanta",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(avocado)
X Date AveragePrice Total.Volume
Min. : 0.00 2015-01-04: 108 Min. :0.440 Min. : 85
1st Qu.:10.00 2015-01-11: 108 1st Qu.:1.100 1st Qu.: 10839
Median :24.00 2015-01-18: 108 Median :1.370 Median : 107377
Mean :24.23 2015-01-25: 108 Mean :1.406 Mean : 850644
3rd Qu.:38.00 2015-02-01: 108 3rd Qu.:1.660 3rd Qu.: 432962
Max. :52.00 2015-02-08: 108 Max. :3.250 Max. :62505647
(Other) :17601
X4046 X4225 X4770 Total.Bags
Min. : 0 Min. : 0 Min. : 0 Min. : 0
1st Qu.: 854 1st Qu.: 3009 1st Qu.: 0 1st Qu.: 5089
Median : 8645 Median : 29061 Median : 185 Median : 39744
Mean : 293008 Mean : 295155 Mean : 22840 Mean : 239639
3rd Qu.: 111020 3rd Qu.: 150207 3rd Qu.: 6243 3rd Qu.: 110783
Max. :22743616 Max. :20470573 Max. :2546439 Max. :19373134
Small.Bags Large.Bags XLarge.Bags type
Min. : 0 Min. : 0 Min. : 0.0 conventional:9126
1st Qu.: 2849 1st Qu.: 127 1st Qu.: 0.0 organic :9123
Median : 26363 Median : 2648 Median : 0.0
Mean : 182195 Mean : 54338 Mean : 3106.4
3rd Qu.: 83338 3rd Qu.: 22029 3rd Qu.: 132.5
Max. :13384587 Max. :5719097 Max. :551693.7
year region
Min. :2015 Albany : 338
1st Qu.:2015 Atlanta : 338
Median :2016 BaltimoreWashington: 338
Mean :2016 Boise : 338
3rd Qu.:2017 Boston : 338
Max. :2018 BuffaloRochester : 338
(Other) :16221
# check for missing values
apply(avocado, 2, function(x) any(is.na(x) | is.infinite(x) | is.null(x)))
X Date AveragePrice Total.Volume X4046 X4225
FALSE FALSE FALSE FALSE FALSE FALSE
X4770 Total.Bags Small.Bags Large.Bags XLarge.Bags type
FALSE FALSE FALSE FALSE FALSE FALSE
year region
FALSE FALSE
# get rid of spaces in column names
names(avocado) <- make.names(names(avocado))
# make all column names lower case
for( i in colnames(avocado)) {
colnames(avocado)[which(colnames(avocado) == i)] = tolower(i)
}
avocado
library(lubridate)
avocado$year <- year(ymd(as.character(avocado$date)))
avocado$month <- month(ymd(as.character(avocado$date)))
avocado$week <- week(ymd(as.character(avocado$date)))
glimpse(avocado)
Observations: 18,249
Variables: 16
$ x [3m[38;5;246m<int>[39m[23m 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
$ date [3m[38;5;246m<fct>[39m[23m 2015-12-27, 2015-12-20, 2015-12-13, 2015-12-06, 2015-11-29, 20…
$ averageprice [3m[38;5;246m<dbl>[39m[23m 1.33, 1.35, 0.93, 1.08, 1.28, 1.26, 0.99, 0.98, 1.02, 1.07, 1.…
$ total.volume [3m[38;5;246m<dbl>[39m[23m 64236.62, 54876.98, 118220.22, 78992.15, 51039.60, 55979.78, 8…
$ x4046 [3m[38;5;246m<dbl>[39m[23m 1036.74, 674.28, 794.70, 1132.00, 941.48, 1184.27, 1368.92, 70…
$ x4225 [3m[38;5;246m<dbl>[39m[23m 54454.85, 44638.81, 109149.67, 71976.41, 43838.39, 48067.99, 7…
$ x4770 [3m[38;5;246m<dbl>[39m[23m 48.16, 58.33, 130.50, 72.58, 75.78, 43.61, 93.26, 80.00, 85.34…
$ total.bags [3m[38;5;246m<dbl>[39m[23m 8696.87, 9505.56, 8145.35, 5811.16, 6183.95, 6683.91, 8318.86,…
$ small.bags [3m[38;5;246m<dbl>[39m[23m 8603.62, 9408.07, 8042.21, 5677.40, 5986.26, 6556.47, 8196.81,…
$ large.bags [3m[38;5;246m<dbl>[39m[23m 93.25, 97.49, 103.14, 133.76, 197.69, 127.44, 122.05, 562.37, …
$ xlarge.bags [3m[38;5;246m<dbl>[39m[23m 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
$ type [3m[38;5;246m<fct>[39m[23m conventional, conventional, conventional, conventional, conven…
$ year [3m[38;5;246m<dbl>[39m[23m 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 20…
$ region [3m[38;5;246m<fct>[39m[23m Albany, Albany, Albany, Albany, Albany, Albany, Albany, Albany…
$ month [3m[38;5;246m<dbl>[39m[23m 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9…
$ week [3m[38;5;246m<dbl>[39m[23m 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37…
avocado %>%
ggplot(aes(x = averageprice)) +
geom_histogram() +
facet_wrap(~ year)
# seems to be less data in 2018 compared to other years
#table(avocado$year)
avocado %>%
ggplot(aes(x = averageprice, fill = type)) +
geom_histogram() +
facet_wrap(~ month)
#table(avocado$month)
avocado %>%
ggplot(aes(x = averageprice, fill = type)) +
geom_histogram() +
facet_wrap(~ week)
#table(avocado$week)
avocado %>%
ggplot(aes(x = as.factor(year), y = averageprice, group = year)) +
geom_boxplot()
avocado %>%
ggplot(aes(x = as.factor(month), y = averageprice, group = month)) +
geom_boxplot()
avocado %>%
ggplot(aes(x = as.factor(type), y = averageprice)) +
geom_boxplot() +
coord_flip()
#must greater spread of prices for organic
var(avocado$averageprice)
[1] 0.1621484
mean(avocado$averageprice)
[1] 1.405978
sd(avocado$averageprice)
[1] 0.4026766
#assuming normally distributed averageprice
#approx 68 percent of avocados in our data sold at prices between $1.405978 - $0.4026766 = $1.003302 and $1.405978 + $0.4026766 = $1.808655
mean(avocado$averageprice) - sd(avocado$averageprice)
[1] 1.003302
mean(avocado$averageprice) + sd(avocado$averageprice)
[1] 1.808655
avocado %>%
ggplot(aes(x = averageprice, fill = type)) +
geom_histogram()
avocado %>%
ggplot(aes(x = averageprice, fill = type)) +
geom_histogram() +
facet_wrap(~ type)
#more conventional than organic sold but at generally lower price.
avocado %>%
ggplot(aes(x = region, y = averageprice, color = as.factor(year))) +
geom_point() +
coord_flip() +
facet_wrap(~ year)
# great deal of variability of price around region
avocado %>%
ggplot(aes(x = region, y = averageprice, color = as.factor(month))) +
geom_point() +
coord_flip() +
facet_wrap(~ month)
# great deal of variability of price around region and month
avocado %>%
ggplot(aes(x = region, y = averageprice, color = as.factor(year))) +
geom_point() +
coord_flip() +
facet_wrap(~ type)
# and variability around year too
avocado %>%
ggplot(aes(x = ymd(as.character(avocado$date)), y = averageprice, color = type)) +
geom_line() +
facet_wrap(~ type, ncol = 1)
region_table <- table(avocado$region)
round(prop.table(region_table) *100, digits = 1)
Albany Atlanta BaltimoreWashington Boise
1.9 1.9 1.9 1.9
Boston BuffaloRochester California Charlotte
1.9 1.9 1.9 1.9
Chicago CincinnatiDayton Columbus DallasFtWorth
1.9 1.9 1.9 1.9
Denver Detroit GrandRapids GreatLakes
1.9 1.9 1.9 1.9
HarrisburgScranton HartfordSpringfield Houston Indianapolis
1.9 1.9 1.9 1.9
Jacksonville LasVegas LosAngeles Louisville
1.9 1.9 1.9 1.9
MiamiFtLauderdale Midsouth Nashville NewOrleansMobile
1.9 1.9 1.9 1.9
NewYork Northeast NorthernNewEngland Orlando
1.9 1.9 1.9 1.9
Philadelphia PhoenixTucson Pittsburgh Plains
1.9 1.9 1.9 1.9
Portland RaleighGreensboro RichmondNorfolk Roanoke
1.9 1.9 1.9 1.9
Sacramento SanDiego SanFrancisco Seattle
1.9 1.9 1.9 1.9
SouthCarolina SouthCentral Southeast Spokane
1.9 1.9 1.9 1.9
StLouis Syracuse Tampa TotalUS
1.9 1.9 1.9 1.9
West WestTexNewMexico
1.9 1.8
type_table <- table(avocado$type)
round(prop.table(type_table) * 100, digits = 1)
conventional organic
50 50
table(avocado$year)
2015 2016 2017 2018
5615 5616 5722 1296
summary(avocado)
x date averageprice total.volume
Min. : 0.00 2015-01-04: 108 Min. :0.440 Min. : 85
1st Qu.:10.00 2015-01-11: 108 1st Qu.:1.100 1st Qu.: 10839
Median :24.00 2015-01-18: 108 Median :1.370 Median : 107377
Mean :24.23 2015-01-25: 108 Mean :1.406 Mean : 850644
3rd Qu.:38.00 2015-02-01: 108 3rd Qu.:1.660 3rd Qu.: 432962
Max. :52.00 2015-02-08: 108 Max. :3.250 Max. :62505647
(Other) :17601
x4046 x4225 x4770 total.bags
Min. : 0 Min. : 0 Min. : 0 Min. : 0
1st Qu.: 854 1st Qu.: 3009 1st Qu.: 0 1st Qu.: 5089
Median : 8645 Median : 29061 Median : 185 Median : 39744
Mean : 293008 Mean : 295155 Mean : 22840 Mean : 239639
3rd Qu.: 111020 3rd Qu.: 150207 3rd Qu.: 6243 3rd Qu.: 110783
Max. :22743616 Max. :20470573 Max. :2546439 Max. :19373134
small.bags large.bags xlarge.bags type
Min. : 0 Min. : 0 Min. : 0.0 conventional:9126
1st Qu.: 2849 1st Qu.: 127 1st Qu.: 0.0 organic :9123
Median : 26363 Median : 2648 Median : 0.0
Mean : 182195 Mean : 54338 Mean : 3106.4
3rd Qu.: 83338 3rd Qu.: 22029 3rd Qu.: 132.5
Max. :13384587 Max. :5719097 Max. :551693.7
year region month week
Min. :2015 Albany : 338 Min. : 1.000 Min. : 1.00
1st Qu.:2015 Atlanta : 338 1st Qu.: 3.000 1st Qu.:11.00
Median :2016 BaltimoreWashington: 338 Median : 6.000 Median :25.00
Mean :2016 Boise : 338 Mean : 6.177 Mean :25.24
3rd Qu.:2017 Boston : 338 3rd Qu.: 9.000 3rd Qu.:39.00
Max. :2018 BuffaloRochester : 338 Max. :12.000 Max. :53.00
(Other) :16221
library(psych)
pairs.panels(avocado[c("averageprice", "x4046", "x4225", "x4770", "small.bags", "large.bags", "xlarge.bags", "type", "region", "month", "week", "year")])
summary(avocado)
x date averageprice total.volume
Min. : 0.00 2015-01-04: 108 Min. :0.440 Min. : 85
1st Qu.:10.00 2015-01-11: 108 1st Qu.:1.100 1st Qu.: 10839
Median :24.00 2015-01-18: 108 Median :1.370 Median : 107377
Mean :24.23 2015-01-25: 108 Mean :1.406 Mean : 850644
3rd Qu.:38.00 2015-02-01: 108 3rd Qu.:1.660 3rd Qu.: 432962
Max. :52.00 2015-02-08: 108 Max. :3.250 Max. :62505647
(Other) :17601
x4046 x4225 x4770 total.bags
Min. : 0 Min. : 0 Min. : 0 Min. : 0
1st Qu.: 854 1st Qu.: 3009 1st Qu.: 0 1st Qu.: 5089
Median : 8645 Median : 29061 Median : 185 Median : 39744
Mean : 293008 Mean : 295155 Mean : 22840 Mean : 239639
3rd Qu.: 111020 3rd Qu.: 150207 3rd Qu.: 6243 3rd Qu.: 110783
Max. :22743616 Max. :20470573 Max. :2546439 Max. :19373134
small.bags large.bags xlarge.bags type
Min. : 0 Min. : 0 Min. : 0.0 conventional:9126
1st Qu.: 2849 1st Qu.: 127 1st Qu.: 0.0 organic :9123
Median : 26363 Median : 2648 Median : 0.0
Mean : 182195 Mean : 54338 Mean : 3106.4
3rd Qu.: 83338 3rd Qu.: 22029 3rd Qu.: 132.5
Max. :13384587 Max. :5719097 Max. :551693.7
year region month week
Min. :2015 Albany : 338 Min. : 1.000 Min. : 1.00
1st Qu.:2015 Atlanta : 338 1st Qu.: 3.000 1st Qu.:11.00
Median :2016 BaltimoreWashington: 338 Median : 6.000 Median :25.00
Mean :2016 Boise : 338 Mean : 6.177 Mean :25.24
3rd Qu.:2017 Boston : 338 3rd Qu.: 9.000 3rd Qu.:39.00
Max. :2018 BuffaloRochester : 338 Max. :12.000 Max. :53.00
(Other) :16221
# tidy up data
avocado_tidy <- avocado %>%
select(-c("x", "date", "total.volume", "total.bags"))
glimpse(avocado_tidy)
Observations: 18,249
Variables: 12
$ averageprice [3m[38;5;246m<dbl>[39m[23m 1.33, 1.35, 0.93, 1.08, 1.28, 1.26, 0.99, 0.98, 1.02, 1.07, 1.…
$ x4046 [3m[38;5;246m<dbl>[39m[23m 1036.74, 674.28, 794.70, 1132.00, 941.48, 1184.27, 1368.92, 70…
$ x4225 [3m[38;5;246m<dbl>[39m[23m 54454.85, 44638.81, 109149.67, 71976.41, 43838.39, 48067.99, 7…
$ x4770 [3m[38;5;246m<dbl>[39m[23m 48.16, 58.33, 130.50, 72.58, 75.78, 43.61, 93.26, 80.00, 85.34…
$ small.bags [3m[38;5;246m<dbl>[39m[23m 8603.62, 9408.07, 8042.21, 5677.40, 5986.26, 6556.47, 8196.81,…
$ large.bags [3m[38;5;246m<dbl>[39m[23m 93.25, 97.49, 103.14, 133.76, 197.69, 127.44, 122.05, 562.37, …
$ xlarge.bags [3m[38;5;246m<dbl>[39m[23m 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.…
$ type [3m[38;5;246m<fct>[39m[23m conventional, conventional, conventional, conventional, conven…
$ year [3m[38;5;246m<dbl>[39m[23m 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 2015, 20…
$ region [3m[38;5;246m<fct>[39m[23m Albany, Albany, Albany, Albany, Albany, Albany, Albany, Albany…
$ month [3m[38;5;246m<dbl>[39m[23m 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9…
$ week [3m[38;5;246m<dbl>[39m[23m 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37…
# x is just reference so no predictive power, taken out date as I have year, week and month. total.volume and total.bags can be derived
# from the other data
# changing type to logical as there are only two types of avocado - didn't bother with this as noticed the model did all of this automatically.
#avocado_tidy$is.organic <- with(avocado_tidy, type=="organic")
# using alias to check is there are any aliased vairables | this result (I assume) means no aliased variables
alias(averageprice ~ ., data = avocado_tidy)
Model :
averageprice ~ x4046 + x4225 + x4770 + small.bags + large.bags +
xlarge.bags + type + year + region + month + week
avocado_tidy %>%
ggplot(aes(x = averageprice, y = type)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
library(caret)
# using K-fold cross validation
# using 10 folds
# first model has everything included!
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
model <- train(averageprice ~ ., data = avocado_tidy,
trControl = cv_10_fold,
method = "lm")
model$pred
model$resample
mean(model$resample$RMSE)
[1] 0.2582568
mean(model$resample$Rsquared)
[1] 0.5887593
summary(model)
Call:
lm(formula = .outcome ~ ., data = dat)
Residuals:
Min 1Q Median 3Q Max
-1.03539 -0.15729 -0.00504 0.14804 1.51228
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.061e+02 4.269e+00 -24.849 < 2e-16 ***
x4046 1.034e-08 5.752e-09 1.799 0.072083 .
x4225 -1.084e-08 6.790e-09 -1.597 0.110331
x4770 -3.255e-09 4.421e-08 -0.074 0.941308
small.bags -1.777e-08 1.206e-08 -1.474 0.140629
large.bags -6.559e-08 2.101e-08 -3.121 0.001803 **
xlarge.bags 1.365e-06 2.037e-07 6.702 2.12e-11 ***
typeorganic 4.921e-01 4.036e-03 121.909 < 2e-16 ***
year 5.321e-02 2.117e-03 25.135 < 2e-16 ***
regionAtlanta -2.228e-01 1.985e-02 -11.224 < 2e-16 ***
regionBaltimoreWashington -2.373e-02 1.987e-02 -1.194 0.232363
regionBoise -2.131e-01 1.983e-02 -10.746 < 2e-16 ***
regionBoston -2.727e-02 1.986e-02 -1.373 0.169619
regionBuffaloRochester -4.383e-02 1.983e-02 -2.210 0.027107 *
regionCalifornia -1.736e-01 2.028e-02 -8.559 < 2e-16 ***
regionCharlotte 4.526e-02 1.984e-02 2.282 0.022522 *
regionChicago -2.472e-03 1.994e-02 -0.124 0.901307
regionCincinnatiDayton -3.495e-01 1.985e-02 -17.608 < 2e-16 ***
regionColumbus -3.090e-01 1.983e-02 -15.578 < 2e-16 ***
regionDallasFtWorth -4.763e-01 1.987e-02 -23.968 < 2e-16 ***
regionDenver -3.335e-01 1.995e-02 -16.713 < 2e-16 ***
regionDetroit -2.906e-01 1.987e-02 -14.624 < 2e-16 ***
regionGrandRapids -5.871e-02 1.984e-02 -2.960 0.003082 **
regionGreatLakes -2.265e-01 2.047e-02 -11.069 < 2e-16 ***
regionHarrisburgScranton -4.756e-02 1.983e-02 -2.398 0.016482 *
regionHartfordSpringfield 2.587e-01 1.984e-02 13.043 < 2e-16 ***
regionHouston -5.112e-01 1.987e-02 -25.734 < 2e-16 ***
regionIndianapolis -2.469e-01 1.983e-02 -12.448 < 2e-16 ***
regionJacksonville -5.003e-02 1.983e-02 -2.522 0.011665 *
regionLasVegas -1.785e-01 1.984e-02 -8.997 < 2e-16 ***
regionLosAngeles -3.555e-01 2.015e-02 -17.644 < 2e-16 ***
regionLouisville -2.741e-01 1.983e-02 -13.821 < 2e-16 ***
regionMiamiFtLauderdale -1.326e-01 1.986e-02 -6.678 2.49e-11 ***
regionMidsouth -1.472e-01 2.003e-02 -7.350 2.06e-13 ***
regionNashville -3.491e-01 1.983e-02 -17.603 < 2e-16 ***
regionNewOrleansMobile -2.584e-01 1.984e-02 -13.026 < 2e-16 ***
regionNewYork 1.746e-01 2.002e-02 8.721 < 2e-16 ***
regionNortheast 6.281e-02 2.145e-02 2.929 0.003407 **
regionNorthernNewEngland -8.176e-02 1.984e-02 -4.121 3.79e-05 ***
regionOrlando -5.500e-02 1.984e-02 -2.772 0.005570 **
regionPhiladelphia 7.308e-02 1.984e-02 3.683 0.000231 ***
regionPhoenixTucson -3.355e-01 1.989e-02 -16.867 < 2e-16 ***
regionPittsburgh -1.966e-01 1.983e-02 -9.915 < 2e-16 ***
regionPlains -1.258e-01 1.989e-02 -6.328 2.54e-10 ***
regionPortland -2.399e-01 1.985e-02 -12.083 < 2e-16 ***
regionRaleighGreensboro -5.601e-03 1.984e-02 -0.282 0.777695
regionRichmondNorfolk -2.697e-01 1.983e-02 -13.600 < 2e-16 ***
regionRoanoke -3.132e-01 1.983e-02 -15.791 < 2e-16 ***
regionSacramento 6.037e-02 1.983e-02 3.044 0.002337 **
regionSanDiego -1.623e-01 1.984e-02 -8.183 2.96e-16 ***
regionSanFrancisco 2.444e-01 1.985e-02 12.312 < 2e-16 ***
regionSeattle -1.148e-01 1.985e-02 -5.784 7.43e-09 ***
regionSouthCarolina -1.579e-01 1.984e-02 -7.963 1.78e-15 ***
regionSouthCentral -4.613e-01 2.053e-02 -22.466 < 2e-16 ***
regionSoutheast -1.614e-01 2.038e-02 -7.920 2.51e-15 ***
regionSpokane -1.154e-01 1.983e-02 -5.819 6.03e-09 ***
regionStLouis -1.309e-01 1.983e-02 -6.600 4.23e-11 ***
regionSyracuse -4.080e-02 1.983e-02 -2.058 0.039646 *
regionTampa -1.521e-01 1.984e-02 -7.664 1.89e-14 ***
regionTotalUS -1.879e-01 2.446e-02 -7.685 1.61e-14 ***
regionWest -2.554e-01 2.067e-02 -12.355 < 2e-16 ***
regionWestTexNewMexico -2.962e-01 1.992e-02 -14.873 < 2e-16 ***
month -2.037e-02 6.578e-03 -3.097 0.001956 **
week 9.460e-03 1.500e-03 6.307 2.90e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2578 on 18185 degrees of freedom
Multiple R-squared: 0.5916, Adjusted R-squared: 0.5902
F-statistic: 418.1 on 63 and 18185 DF, p-value: < 2.2e-16
model <- lm(averageprice ~ ., data = avocado_tidy)
par(mfrow = c(2, 2))
plot(model)
looking for a lower error (RMSE) and a higher R squared value
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# much simpler model
model1 <- train(averageprice ~ type + region,
data = avocado_tidy,
trControl = cv_10_fold,
method = "lm")
mean(model1$resample$RMSE)
[1] 0.2716248
mean(model1$resample$Rsquared)
[1] 0.5450222
wrong direction
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# even simpler
model2 <- train(averageprice ~ type,
data = avocado_tidy,
trControl = cv_10_fold,
method = "lm")
mean(model2$resample$RMSE)
[1] 0.3172512
mean(model2$resample$Rsquared)
[1] 0.3793824
even worse
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
model3 <- train(averageprice ~ type + month,
data = avocado_tidy,
trControl = cv_10_fold,
method = "lm")
mean(model3$resample$RMSE)
[1] 0.3104094
mean(model3$resample$Rsquared)
[1] 0.4058502
error reduced slightly and R squared increased
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
model4 <- train(averageprice ~ type + year + region + week + month,
data = avocado_tidy,
trControl = cv_10_fold,
method = "lm")
mean(model4$resample$RMSE)
[1] 0.2585822
mean(model4$resample$Rsquared)
[1] 0.5873909
error reduced slightly further and R squared increased quite a bit
model4$resample
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
model5 <- train(log(averageprice) ~ type + year + region + week + month,
data = avocado_tidy,
trControl = cv_10_fold,
method = "lm")
mean(model5$resample$RMSE)
[1] 0.1828585
mean(model5$resample$Rsquared)
[1] 0.6016292
model5$resample
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
model6 <- train(log(averageprice) ~ type + year + region + week + month + region:type,
data = avocado_tidy,
trControl = cv_10_fold,
method = "lm")
mean(model6$resample$RMSE)
[1] 0.1730975
mean(model6$resample$Rsquared)
[1] 0.6432118
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# adding an interaction
model7 <- train(log(averageprice) ~ type + year + region + week + month + region:type + region:month,
data = avocado_tidy,
trControl = cv_10_fold,
method = "lm")
mean(model7$resample$RMSE)
[1] 0.1713983
mean(model7$resample$Rsquared)
[1] 0.6501079
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# adding a further interaction
model8 <- train(log(averageprice) ~ type + year + region + week + month + region:type + region:week,
data = avocado_tidy,
trControl = cv_10_fold,
method = "lm")
mean(model8$resample$RMSE)
[1] 0.1714282
mean(model8$resample$Rsquared)
[1] 0.6498005
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# adding a further interaction
model9 <- train(log(averageprice) ~ type + year + region + week + month + large.bags + region:type + region:week,
data = avocado_tidy,
trControl = cv_10_fold,
method = "lm")
mean(model9$resample$RMSE)
[1] 0.1711208
mean(model9$resample$Rsquared)
[1] 0.6511975
cv_10_fold <- trainControl(method = "cv", number = 10, savePredictions = TRUE)
# adding a further interaction
model10 <- train(log(averageprice) ~ type + year + region + week + month + large.bags + x4046 + region:type + region:week,
data = avocado_tidy,
trControl = cv_10_fold,
method = "lm")
mean(model10$resample$RMSE)
[1] 0.1706929
mean(model10$resample$Rsquared)
[1] 0.6529101
model_best <- lm(log(averageprice) ~ type + year + region + week + month + large.bags + region:type + region:week, data = avocado_tidy)
broom::glance(model_best)